##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Loading required package: grid
## ========================================
## ComplexHeatmap version 2.6.2
## Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
## Github page: https://github.com/jokergoo/ComplexHeatmap
## Documentation: http://jokergoo.github.io/ComplexHeatmap-reference
##
## If you use it in published research, please cite:
## Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional
## genomic data. Bioinformatics 2016.
##
## This message can be suppressed by:
## suppressPackageStartupMessages(library(ComplexHeatmap))
## ========================================
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
##
## Attaching package: 'matrixStats'
## The following object is masked from 'package:dplyr':
##
## count
##
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
##
## colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
## colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
## colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
## colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
## colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
## colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
## colWeightedMeans, colWeightedMedians, colWeightedSds,
## colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
## rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
## rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
## rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
## rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
## rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
## rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
## rowWeightedSds, rowWeightedVars
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:dplyr':
##
## combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:dplyr':
##
## first, rename
## The following object is masked from 'package:base':
##
## expand.grid
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
## The following objects are masked from 'package:dplyr':
##
## collapse, desc, slice
## Loading required package: GenomeInfoDb
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
##
## rowMedians
## The following objects are masked from 'package:matrixStats':
##
## anyMissing, rowMedians
GTEX_file<-"/media/theron/My_Passport/TCGA_junctions/ext_dat/GTEx_recount3_selection_2021-09-14_17_25_41.csv"
TCGA_file<-"/media/theron/My_Passport/TCGA_junctions/ext_dat/TCGA_recount3_selection_2021-09-14_17_27_07.csv"
GTEX_samples <- read.table(GTEX_file,sep=",",header=T)
TCGA_samples <- read.table(TCGA_file,sep=",",header=T)
all_samples<-rbind(GTEX_samples,TCGA_samples)
STUDY_ABREVIATION<-c("LAML","ACC","BLCA","LGG","BRCA","CESC","CHOL","LCML","COAD","CNTL","ESCA","FPPP","GBM","HNSC","KICH","KIRC","KIRP","LIHC","LUAD","LUSC","DLBC","MESO","MISC","OV","PAAD","PCPG","PRAD","READ","SARC","SKCM","STAD","TGCT","THYM","THCA","UCS","UCEC","UVM")
STUDY_NAME<-c("Acute Myeloid Leukemia","Adrenocortical carcinoma","Bladder Urothelial Carcinoma","Brain Lower Grade Glioma","Breast invasive carcinoma","Cervical squamous cell carcinoma and endocervical adenocarcinoma","Cholangiocarcinoma","Chronic Myelogenous Leukemia","Colon adenocarcinoma","Controls","Esophageal carcinoma","FFPE Pilot Phase II","Glioblastoma multiforme","Head and Neck squamous cell carcinoma","Kidney Chromophobe","Kidney renal clear cell carcinoma","Kidney renal papillary cell carcinoma","Liver hepatocellular carcinoma","Lung adenocarcinoma","Lung squamous cell carcinoma","Lymphoid Neoplasm Diffuse Large B-cell Lymphoma","Mesothelioma","Miscellaneous","Ovarian serous cystadenocarcinoma","Pancreatic adenocarcinoma","Pheochromocytoma and Paraganglioma","Prostate adenocarcinoma","Rectum adenocarcinoma","Sarcoma","Skin Cutaneous Melanoma","Stomach adenocarcinoma","Testicular Germ Cell Tumors","Thymoma","Thyroid carcinoma","Uterine Carcinosarcoma","Uterine Corpus Endometrial Carcinoma","Uveal Melanoma")
STUDY_ABREVIATION_NAMES=data.frame(STUDY_ABREVIATION,STUDY_NAME)
colnames(STUDY_ABREVIATION_NAMES)<-c("ABREVIATION","NAME")
all_samples$DESC <- unlist(lapply(all_samples$project,function(proj){
a<-which(STUDY_ABREVIATION_NAMES$ABREVIATION==proj)
if (length(a)==0){
return("")
} else {
return(STUDY_ABREVIATION_NAMES$NAME[a])
}
}))
write.table(data.frame(all_samples$project),
file="/media/theron/My_Passport/TCGA_junctions/TCGA_cancers/projects.txt",
sep="\t",
quote=F,
col.names=F,
row.names=F)
STUDY_ABREVIATION<-c("LAML","ACC","BLCA","LGG","BRCA","CESC","CHOL","LCML","COAD","ESCA","GBM","HNSC","KICH","KIRC","KIRP","LIHC","LUAD","LUSC","DLBC","MESO","OV","PAAD","PCPG","PRAD","READ","SARC","SKCM","STAD","TGCT","THYM","THCA","UCS","UCEC","UVM")
GTEX_map<-c("BLOOD,BONE_MARROW","ADRENAL_GLAND","BLADDER","BRAIN","BREAST","CERVIX_UTERI","LIVER","BLOOD","COLON","ESOPHAGUS","BRAIN","ESOPHAGUS,SALIVARY_GLAND","KIDNEY","KIDNEY","KIDNEY","LIVER","LUNG","LUNG","BLOOD","LUNG","FALLOPIAN_TUBE","PANCREAS","ADRENAL_GLAND","PROSTATE","COLON","ADIPOSE_TISSUE","SKIN","STOMACH","TESTIS","?","THYROID","UTERUS","UTERUS","?")
TCGA_GTEX_map<-data.frame(STUDY_ABREVIATION,GTEX_map)
colnames(TCGA_GTEX_map)<-c("TCGA","GTEX")
write.table(TCGA_GTEX_map,
file="/media/theron/My_Passport/TCGA_junctions/TCGA_cancers/TCGA_GTEX_map.txt",
sep="\t",
quote=F,
col.names=T,
row.names=F)
STUDY_ABREVIATION<-c("LAML","ACC","BLCA","LGG","BRCA","CESC","CHOL","COAD","ESCA","GBM","HNSC","KICH","KIRC","KIRP","LIHC","LUAD","LUSC","DLBC","MESO","OV","PAAD","PCPG","PRAD","READ","SARC","SKCM","STAD","TGCT","THYM","THCA","UCS","UCEC","UVM")
TCGA_dir <- "/media/theron/My_Passport/TCGA_junctions/TCGA_cancers"
for (i in seq(length(STUDY_ABREVIATION))){
if (i == 1){
all_metadata <- readRDS(sprintf("%s/%s/%s_metadata.rds",TCGA_dir,STUDY_ABREVIATION[i],STUDY_ABREVIATION[i]))
}
fill <- readRDS(sprintf("%s/%s/%s_metadata.rds",TCGA_dir,STUDY_ABREVIATION[i],STUDY_ABREVIATION[i]))
all_metadata<-rbind(all_metadata,fill)
}
saveRDS(all_metadata,file="/media/theron/My_Passport/TCGA_junctions/TCGA_cancers/TCGA_recount3_metadata.rds")
GTEX_dir <- "/media/theron/My_Passport/TCGA_junctions/TCGA_cancers/GTEX"
human_projects <- available_projects()
## 2021-10-14 10:40:09 caching file sra.recount_project.MD.gz.
## 2021-10-14 10:40:11 caching file gtex.recount_project.MD.gz.
## 2021-10-14 10:40:13 caching file tcga.recount_project.MD.gz.
proj_info <- subset(
human_projects,
file_source %in% c("gtex")
)
GTEX_STUDIES<-proj_info$project
for (i in seq(length(GTEX_STUDIES))){
if (i == 1){
all_metadata <- readRDS(sprintf("%s/%s/%s_metadata.rds",GTEX_dir,GTEX_STUDIES[i],GTEX_STUDIES[i]))
}
fill <- readRDS(sprintf("%s/%s/%s_metadata.rds",GTEX_dir,GTEX_STUDIES[i],GTEX_STUDIES[i]))
all_metadata<-rbind(all_metadata,fill)
}
saveRDS(all_metadata,file="/media/theron/My_Passport/TCGA_junctions/TCGA_cancers/GTEX_recount3_metadata.rds")
TCGA_metadata<-readRDS("/media/theron/My_Passport/TCGA_junctions/TCGA_cancers/TCGA_recount3_metadata.rds")
GTEX_metadata<-readRDS("/media/theron/My_Passport/TCGA_junctions/TCGA_cancers/GTEX_recount3_metadata.rds")
mut_dat <- read_excel("/media/theron/My_Passport/TCGA_junctions/ext_dat/13073_2017_424_MOESM3_ESM_coded.xlsx")
mut_dat_complete<-mut_dat[complete.cases(mut_dat),]
The TCGA MAF summary file
maf_file <- "/media/theron/My_Passport/TCGA_junctions/maf_summary.txt"
mc3_maf = read.table(maf_file,header=T)
mc3_maf$Tumor_Sample_ID <- vapply(TCGAbarcode(mc3_maf$Tumor_Sample_Barcode,sample=T),
function(val){substr(val,1,nchar(val)-1)},
character(1))
rownames(mc3_maf) <- mc3_maf$Tumor_Sample_Barcode
mc3_maf$participant_ID <- TCGAbarcode(mc3_maf$Tumor_Sample_Barcode,participant=T)
junc_rse_file <- "/media/theron/My_Passport/TCGA_junctions/TCGA_cancers/CHOL/juncrse.rds"
junc_rse <- readRDS(junc_rse_file)
junc_metadata <- as.data.frame(junc_rse@colData@listData)
junc_rse_cols <- colnames(junc_metadata)
tumor_data_file <- "/media/theron/My_Passport/TCGA_junctions/TCGA_cancers/filenames.txt"
tumor_data <- read.table(tumor_data_file)
cancers <- basename(tumor_data$V1)
nogos <- c("ESCA","MESO","PAAD","KIRC","GBM")
# TMB<-list()
optitype_calls <- read.table("/media/theron/My_Passport/TCGA_junctions/ext_dat/OptiTypeCallsHLA_20171207.tsv",header=T,sep=",")
which_are_correct<-vapply(optitype_calls$aliquot_id,function(ID){
a<-strsplit(ID,"-")[[1]]
if (nchar(a[1]) > 4){
return(F)
} else {
return(T)
}
},logical(1))
optitype_calls_correct <- optitype_calls[which_are_correct,]
optitype_calls_correct$particpant_ID <- TCGAbarcode(optitype_calls_correct$aliquot_id,participant=T)
cluster_metrics_tum <- data.frame(cancers)
for (i in seq(nrow(tumor_data))){
print(sprintf("%d out of %d",i,nrow(tumor_data)))
tumor_dir <- tumor_data[i,]
cancer <- basename(tumor_dir)
print(cancer)
if (cancer %in% nogos){next}
tumor_meta_file <- sprintf("%s/%s_metadata.txt",tumor_dir,cancer)
tumor_meta <- read.table(tumor_meta_file,quote="",sep="\t")
tumor_meta$participant_ID <- TCGAbarcode(tumor_meta[,4],participant=T)
tumor_meta$nbases<-tumor_meta[,ncol(tumor_meta)-9]
mc3_maf_small<-subset(mc3_maf,participant_ID %in% tumor_meta$participant_ID)
mc3_maf_small <- mc3_maf_small[complete.cases(mc3_maf_small),]
mc3_maf_small$type <- vapply(rownames(mc3_maf_small),function(barcode){
type<-as.numeric(substr(strsplit(barcode,"-")[[1]][4],1,2))
if (type <= 9){
return("T")
} else if (type > 9 & type <= 19){
return ("N")
} else {
return ("C")
}
},character(1))
mc3_maf_small$TMB<-log10(mc3_maf_small$total+1)
mc3_maf_small <- mc3_maf_small %>% dplyr::filter(type == "T")
tumor_geno_file <- sprintf("%s/%s_genotypes.txt",tumor_dir,cancer)
tumor_geno <- read.table(tumor_geno_file,header=T)
summ_file <- read.table(sprintf("%s/summaries.txt",tumor_dir))
summ_file <- summ_file$V1
per_sample_summ <- c()
for (file in summ_file){
summ <- read.table(file)
per_sample_summ <- c(per_sample_summ,summ$V1)
}
per_sample_summ <- data.frame(per_sample_summ,tumor_geno$sample_id,tumor_geno$type)
colnames(per_sample_summ) <- c("per_sample_summ","sample_id","type")
per_sample_summ <- unique(per_sample_summ)
rownames(mc3_maf_small) <- mc3_maf_small$Tumor_Sample_ID
per_sample_summ$TMB <- mc3_maf_small[per_sample_summ$sample_id,"TMB"]
per_sample_summ <- per_sample_summ %>% dplyr::filter(type == "T" & per_sample_summ > 0)
cluster_metrics_tum[cluster_metrics_tum$cancers==cancer,"splice_imm"] <- median(per_sample_summ$per_sample_summ)
cluster_metrics_tum[cluster_metrics_tum$cancers==cancer,"TMB"] <- median(per_sample_summ$TMB)
print(ggplot(per_sample_summ,aes(x=log10(per_sample_summ),y=TMB))+
geom_point()+
stat_cor(method = "spearman")+
geom_smooth(method="lm")+
labs(title=sprintf(cancer)))
}
## [1] "1 out of 19"
## [1] "BLCA"
## `geom_smooth()` using formula 'y ~ x'
## [1] "2 out of 19"
## [1] "BRCA"
## Warning: Removed 77 rows containing non-finite values (stat_cor).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 77 rows containing non-finite values (stat_smooth).
## Warning: Removed 77 rows containing missing values (geom_point).
## [1] "3 out of 19"
## [1] "CHOL"
## `geom_smooth()` using formula 'y ~ x'
## [1] "4 out of 19"
## [1] "COAD"
## `geom_smooth()` using formula 'y ~ x'
## [1] "5 out of 19"
## [1] "ESCA"
## [1] "6 out of 19"
## [1] "GBM"
## [1] "7 out of 19"
## [1] "HNSC"
## `geom_smooth()` using formula 'y ~ x'
## [1] "8 out of 19"
## [1] "KICH"
## `geom_smooth()` using formula 'y ~ x'
## [1] "9 out of 19"
## [1] "KIRC"
## [1] "10 out of 19"
## [1] "KIRP"
## `geom_smooth()` using formula 'y ~ x'
## [1] "11 out of 19"
## [1] "LIHC"
## `geom_smooth()` using formula 'y ~ x'
## [1] "12 out of 19"
## [1] "LUAD"
## `geom_smooth()` using formula 'y ~ x'
## [1] "13 out of 19"
## [1] "LUSC"
## `geom_smooth()` using formula 'y ~ x'
## [1] "14 out of 19"
## [1] "MESO"
## [1] "15 out of 19"
## [1] "PAAD"
## [1] "16 out of 19"
## [1] "PRAD"
## `geom_smooth()` using formula 'y ~ x'
## [1] "17 out of 19"
## [1] "READ"
## `geom_smooth()` using formula 'y ~ x'
## [1] "18 out of 19"
## [1] "THCA"
## `geom_smooth()` using formula 'y ~ x'
## [1] "19 out of 19"
## [1] "UCEC"
## `geom_smooth()` using formula 'y ~ x'
ggplot(cluster_metrics_tum,aes(x=log10(splice_imm+1),y=TMB,label=cancers))+
geom_text()+
stat_cor(method = "spearman")+
xlab("splicing antigenicity")
## Warning: Removed 6 rows containing non-finite values (stat_cor).
## Warning: Removed 6 rows containing missing values (geom_text).
splicing_factor_genes<-read_excel("/media/theron/My_Passport/TCGA_junctions/ext_dat/splicing_factor_genes1.xlsx")
splicing_factor_genes<-toupper(splicing_factor_genes$Gene)
write.table(data.frame(splicing_factor_genes),
file="/media/theron/My_Passport/TCGA_junctions/ext_dat/splicing_factor_genes.txt",
sep="\t",
quote=F,
col.names=F,
row.names=F)
splice_maf<-read.maf("/media/theron/My_Passport/TCGA_junctions/splice_factor.maf")
## -Reading
## -Validating
## --Removed 36209 duplicated variants
## -Silent variants: 214408
## -Summarizing
## --Possible FLAGS among top ten genes:
## USH2A
## OBSCN
## HMCN1
## -Processing clinical data
## --Missing clinical data
## -Finished in 00:01:18 elapsed (00:01:39 cpu)
splice_maf_samp<-getSampleSummary(splice_maf)
splice_maf_samp$Tumor_Sample_ID <- TCGAbarcode(as.character(splice_maf_samp$Tumor_Sample_Barcode),sample=T)
splice_maf_samp$participant_ID <- TCGAbarcode(as.character(splice_maf_samp$Tumor_Sample_Barcode),participant=T)
rownames(splice_maf_samp)<-splice_maf_samp$Tumor_Sample_Barcode
splice_maf_samp<-data.frame(splice_maf_samp)
junc_rse_file <- "/media/theron/My_Passport/TCGA_junctions/TCGA_cancers/CHOL/juncrse.rds"
junc_rse <- readRDS(junc_rse_file)
junc_metadata <- as.data.frame(junc_rse@colData@listData)
junc_rse_cols <- colnames(junc_metadata)
tumor_data_file <- "/media/theron/My_Passport/TCGA_junctions/TCGA_cancers/filenames.txt"
tumor_data <- read.table(tumor_data_file)
cancers <- basename(tumor_data$V1)
nogos <- c("ESCA","MESO","PAAD","KIRC","GBM")
# TMB<-list()
optitype_calls <- read.table("/media/theron/My_Passport/TCGA_junctions/ext_dat/OptiTypeCallsHLA_20171207.tsv",header=T,sep=",")
which_are_correct<-vapply(optitype_calls$aliquot_id,function(ID){
a<-strsplit(ID,"-")[[1]]
if (nchar(a[1]) > 4){
return(F)
} else {
return(T)
}
},logical(1))
optitype_calls_correct <- optitype_calls[which_are_correct,]
optitype_calls_correct$particpant_ID <- TCGAbarcode(optitype_calls_correct$aliquot_id,participant=T)
cluster_metrics_tum <- data.frame(cancers)
for (i in seq(nrow(tumor_data))){
print(sprintf("%d out of %d",i,nrow(tumor_data)))
tumor_dir <- tumor_data[i,]
cancer <- basename(tumor_dir)
print(cancer)
if (cancer %in% nogos){next}
tumor_meta_file <- sprintf("%s/%s_metadata.txt",tumor_dir,cancer)
tumor_meta <- read.table(tumor_meta_file,quote="",sep="\t")
tumor_meta$participant_ID <- TCGAbarcode(tumor_meta[,4],participant=T)
tumor_meta$nbases<-tumor_meta[,ncol(tumor_meta)-9]
splice_maf_samp_small<-subset(splice_maf_samp,participant_ID %in% tumor_meta$participant_ID)
splice_maf_samp_small <- splice_maf_samp_small[complete.cases(splice_maf_samp_small),]
rownames(splice_maf_samp_small)<-as.character(splice_maf_samp_small$Tumor_Sample_Barcode)
splice_maf_samp_small$type <- vapply(rownames(splice_maf_samp_small),function(barcode){
type<-as.numeric(substr(strsplit(barcode,"-")[[1]][4],1,2))
if (type <= 9){
return("T")
} else if (type > 9 & type <= 19){
return ("N")
} else {
return ("C")
}
},character(1))
splice_maf_samp_small$TMB<-log10(splice_maf_samp_small$total+1)
splice_maf_samp_small <- splice_maf_samp_small %>% dplyr::filter(type == "T")
tumor_geno_file <- sprintf("%s/%s_genotypes.txt",tumor_dir,cancer)
tumor_geno <- read.table(tumor_geno_file,header=T)
summ_file <- read.table(sprintf("%s/summaries.txt",tumor_dir))
summ_file <- summ_file$V1
per_sample_summ <- c()
for (file in summ_file){
summ <- read.table(file)
per_sample_summ <- c(per_sample_summ,summ$V1)
}
per_sample_summ <- data.frame(per_sample_summ,tumor_geno$sample_id,tumor_geno$type)
colnames(per_sample_summ) <- c("per_sample_summ","sample_id","type")
per_sample_summ <- unique(per_sample_summ)
rownames(splice_maf_samp_small) <- vapply(splice_maf_samp_small$Tumor_Sample_ID,function(val){substr(val,1,nchar(val)-1)},character(1))
per_sample_summ$TMB <- splice_maf_samp_small[per_sample_summ$sample_id,"TMB"]
per_sample_summ <- per_sample_summ %>% dplyr::filter(type == "T" & per_sample_summ > 0)
cluster_metrics_tum[cluster_metrics_tum$cancers==cancer,"splice_imm"] <- median(per_sample_summ$per_sample_summ,na.rm=T)
cluster_metrics_tum[cluster_metrics_tum$cancers==cancer,"TMB"] <- median(per_sample_summ$TMB,na.rm=T)
print(ggplot(per_sample_summ,aes(x=log10(per_sample_summ),y=TMB))+
geom_point()+
stat_cor(method = "spearman")+
geom_smooth(method="lm")+
labs(title=sprintf(cancer)))
}
## [1] "1 out of 19"
## [1] "BLCA"
## Warning: Removed 1 rows containing non-finite values (stat_cor).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
## [1] "2 out of 19"
## [1] "BRCA"
## Warning: Removed 82 rows containing non-finite values (stat_cor).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 82 rows containing non-finite values (stat_smooth).
## Warning: Removed 82 rows containing missing values (geom_point).
## [1] "3 out of 19"
## [1] "CHOL"
## `geom_smooth()` using formula 'y ~ x'
## [1] "4 out of 19"
## [1] "COAD"
## `geom_smooth()` using formula 'y ~ x'
## [1] "5 out of 19"
## [1] "ESCA"
## [1] "6 out of 19"
## [1] "GBM"
## [1] "7 out of 19"
## [1] "HNSC"
## `geom_smooth()` using formula 'y ~ x'
## [1] "8 out of 19"
## [1] "KICH"
## Warning: Removed 1 rows containing non-finite values (stat_cor).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
## [1] "9 out of 19"
## [1] "KIRC"
## [1] "10 out of 19"
## [1] "KIRP"
## `geom_smooth()` using formula 'y ~ x'
## [1] "11 out of 19"
## [1] "LIHC"
## Warning: Removed 1 rows containing non-finite values (stat_cor).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
## [1] "12 out of 19"
## [1] "LUAD"
## Warning: Removed 1 rows containing non-finite values (stat_cor).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
## [1] "13 out of 19"
## [1] "LUSC"
## Warning: Removed 2 rows containing non-finite values (stat_cor).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing missing values (geom_point).
## [1] "14 out of 19"
## [1] "MESO"
## [1] "15 out of 19"
## [1] "PAAD"
## [1] "16 out of 19"
## [1] "PRAD"
## Warning: Removed 8 rows containing non-finite values (stat_cor).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 8 rows containing non-finite values (stat_smooth).
## Warning: Removed 8 rows containing missing values (geom_point).
## [1] "17 out of 19"
## [1] "READ"
## `geom_smooth()` using formula 'y ~ x'
## [1] "18 out of 19"
## [1] "THCA"
## Warning: Removed 31 rows containing non-finite values (stat_cor).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 31 rows containing non-finite values (stat_smooth).
## Warning: Removed 31 rows containing missing values (geom_point).
## [1] "19 out of 19"
## [1] "UCEC"
## `geom_smooth()` using formula 'y ~ x'
ggplot(cluster_metrics_tum,aes(x=log10(splice_imm+1),y=TMB,label=cancers))+
geom_text()+
stat_cor(method = "spearman")+
xlab("splicing antigenicity")
## Warning: Removed 5 rows containing non-finite values (stat_cor).
## Warning: Removed 5 rows containing missing values (geom_text).
splice_maf_data <- splice_maf@data
splice_maf_data <- splice_maf_data %>% dplyr::filter(Hugo_Symbol %in% splicing_factor_genes)
splice_maf_data <- splice_maf_data[,c("Hugo_Symbol","Variant_Classification","Tumor_Sample_Barcode")]
splice_maf_data$Tumor_Sample_ID <- vapply(TCGAbarcode(as.character(splice_maf_data$Tumor_Sample_Barcode),sample=T),function(val){substr(val,1,nchar(val)-1)},character(1))
splice_maf_data$participant_ID <- TCGAbarcode(as.character(splice_maf_data$Tumor_Sample_Barcode),participant=T)
splice_maf_data$cancer <- NA
splice_maf_data$splice_ant <- NA
splice_maf_data_fill <- splice_maf_data
splice_maf_data_fill$cancer <- as.character(splice_maf_data_fill$cancer)
splice_maf_data_fill$splice_ant <- as.numeric(splice_maf_data_fill$splice_ant)
junc_rse_file <- "/media/theron/My_Passport/TCGA_junctions/TCGA_cancers/CHOL/juncrse.rds"
junc_rse <- readRDS(junc_rse_file)
junc_metadata <- as.data.frame(junc_rse@colData@listData)
junc_rse_cols <- colnames(junc_metadata)
tumor_data_file <- "/media/theron/My_Passport/TCGA_junctions/TCGA_cancers/filenames.txt"
tumor_data <- read.table(tumor_data_file)
cancers <- basename(tumor_data$V1)
nogos <- c("ESCA","MESO","PAAD","KIRC","GBM")
# TMB<-list()
optitype_calls <- read.table("/media/theron/My_Passport/TCGA_junctions/ext_dat/OptiTypeCallsHLA_20171207.tsv",header=T,sep=",")
which_are_correct<-vapply(optitype_calls$aliquot_id,function(ID){
a<-strsplit(ID,"-")[[1]]
if (nchar(a[1]) > 4){
return(F)
} else {
return(T)
}
},logical(1))
optitype_calls_correct <- optitype_calls[which_are_correct,]
optitype_calls_correct$particpant_ID <- TCGAbarcode(optitype_calls_correct$aliquot_id,participant=T)
cluster_metrics_tum <- data.frame(cancers)
for (i in seq(nrow(tumor_data))){
print(sprintf("%d out of %d",i,nrow(tumor_data)))
tumor_dir <- tumor_data[i,]
cancer <- basename(tumor_dir)
print(cancer)
if (cancer %in% nogos){next}
tumor_geno_file <- sprintf("%s/%s_genotypes.txt",tumor_dir,cancer)
tumor_geno <- read.table(tumor_geno_file,header=T)
summ_file <- read.table(sprintf("%s/summaries.txt",tumor_dir))
summ_file <- summ_file$V1
per_sample_summ <- c()
for (file in summ_file){
summ <- read.table(file)
per_sample_summ <- c(per_sample_summ,summ$V1)
}
per_sample_summ <- data.frame(per_sample_summ,tumor_geno$sample_id,tumor_geno$type)
colnames(per_sample_summ) <- c("per_sample_summ","sample_id","type")
per_sample_summ <- unique(per_sample_summ)
aa <- vapply(seq(nrow(per_sample_summ)),function(row_val){
ID <- per_sample_summ$sample_id[row_val]
splice_ant <- per_sample_summ$per_sample_summ[row_val]
locs <- which(splice_maf_data_fill$Tumor_Sample_ID == ID)
splice_maf_data_fill[locs,"cancer"] <<- cancer
splice_maf_data_fill[locs,"splice_ant"] <<- splice_ant
return(T)
},logical(1))
}
## [1] "1 out of 19"
## [1] "BLCA"
## [1] "2 out of 19"
## [1] "BRCA"
## [1] "3 out of 19"
## [1] "CHOL"
## [1] "4 out of 19"
## [1] "COAD"
## [1] "5 out of 19"
## [1] "ESCA"
## [1] "6 out of 19"
## [1] "GBM"
## [1] "7 out of 19"
## [1] "HNSC"
## [1] "8 out of 19"
## [1] "KICH"
## [1] "9 out of 19"
## [1] "KIRC"
## [1] "10 out of 19"
## [1] "KIRP"
## [1] "11 out of 19"
## [1] "LIHC"
## [1] "12 out of 19"
## [1] "LUAD"
## [1] "13 out of 19"
## [1] "LUSC"
## [1] "14 out of 19"
## [1] "MESO"
## [1] "15 out of 19"
## [1] "PAAD"
## [1] "16 out of 19"
## [1] "PRAD"
## [1] "17 out of 19"
## [1] "READ"
## [1] "18 out of 19"
## [1] "THCA"
## [1] "19 out of 19"
## [1] "UCEC"
splice_maf_data_fill <- splice_maf_data_fill[complete.cases(splice_maf_data_fill),]
splice_maf_data_fill <- splice_maf_data_fill %>% dplyr::filter(splice_ant > 0)
sample_cancer_dat <- unique(splice_maf_data_fill[,c("Tumor_Sample_ID","cancer","splice_ant")])
a<-lapply(sample_cancer_dat$Tumor_Sample_ID,function(ID){
splice_maf_data_fill_small <- splice_maf_data_fill %>% dplyr::filter(Tumor_Sample_ID == ID)
vapply(splicing_factor_genes,function(gene){
count <- length(which(splice_maf_data_fill_small$Hugo_Symbol == gene))
},numeric(1))
})
sample_splice_factor_dat <- data.frame(matrix(unlist(a),nrow=nrow(sample_cancer_dat),byrow=T))
rownames(sample_splice_factor_dat) <- sample_cancer_dat$Tumor_Sample_ID
colnames(sample_splice_factor_dat) <- splicing_factor_genes
sample_splice_factor_dat_muts <- sample_splice_factor_dat[which(apply(sample_splice_factor_dat,1,sd) > 0),]
sample_cancer_dat_muts <- sample_cancer_dat[which(apply(sample_splice_factor_dat,1,sd) > 0),]
cancer_order <- order(sample_cancer_dat_muts$cancer)
sample_splice_factor_dat_muts <- sample_splice_factor_dat_muts[cancer_order,]
sample_cancer_dat_muts <- sample_cancer_dat_muts[cancer_order,]
Heatmap(t(scale(t(sample_splice_factor_dat_muts))),
right_annotation = rowAnnotation(spliceant = anno_barplot(sample_cancer_dat_muts$splice_ant)),
left_annotation = rowAnnotation(cancer = sample_cancer_dat_muts$cancer),
show_row_names=F,
show_column_names = F,
cluster_rows=T,
cluster_columns=T)
## `use_raster` is automatically set to TRUE for a matrix with more than
## 2000 rows. You can control `use_raster` argument by explicitly setting
## TRUE/FALSE to it.
##
## Set `ht_opt$message = FALSE` to turn off this message.
## 'magick' package is suggested to install to give better rasterization.
##
## Set `ht_opt$message = FALSE` to turn off this message.
Heatmap(t(scale(t(sample_splice_factor_dat_muts))),
right_annotation = rowAnnotation(spliceant = anno_barplot(sample_cancer_dat_muts$splice_ant)),
left_annotation = rowAnnotation(cancer = sample_cancer_dat_muts$cancer),
show_row_names=F,
show_column_names = F,
cluster_rows=F,
cluster_columns=T)
## `use_raster` is automatically set to TRUE for a matrix with more than
## 2000 rows. You can control `use_raster` argument by explicitly setting
## TRUE/FALSE to it.
##
## Set `ht_opt$message = FALSE` to turn off this message.
## 'magick' package is suggested to install to give better rasterization.
##
## Set `ht_opt$message = FALSE` to turn off this message.
sample_cancer_dat_muts$sum <- apply(sample_splice_factor_dat_muts,1,sum)
for (i in unique(sample_cancer_dat_muts$cancer)){
specific_cancer <- sample_cancer_dat_muts %>% dplyr::filter(cancer == i)
print(ggplot(specific_cancer,aes(x=log10(splice_ant+1),y=log2(sum+1)))+geom_point()+labs(title=i))
}
ggplot(sample_cancer_dat_muts,aes(x=cancer,y=log2(sum+1)))+geom_boxplot()
ggplot(sample_cancer_dat_muts,aes(x=cancer,y=log10(splice_ant+1)))+geom_boxplot()
ggplot(sample_cancer_dat_muts,aes(x=log10(splice_ant+1),y=log10(sum+1)))+
geom_point()+
stat_cor(method = "spearman")+
geom_smooth(method="lm")
## `geom_smooth()` using formula 'y ~ x'
# splicemutr data needed per cancer type
# per-line counts needed per cancer type
# only keep those proteins that are longer than 9 kmers
tumor_data_file <- "/media/theron/My_Passport/TCGA_junctions/TCGA_cancers/filenames.txt"
tumor_data <- read.table(tumor_data_file)
cancers <- basename(tumor_data$V1)
nogos <- c("ESCA","MESO","PAAD","KIRC","GBM")
for (i in seq(nrow(tumor_data))){
print(sprintf("%d out of %d",i,nrow(tumor_data)))
tumor_dir <- tumor_data[i,]
cancer <- basename(tumor_dir)
print(cancer)
if (cancer %in% nogos){next}
splice_dat_file <- sprintf("%s/%s_splicemutr_dat.txt",tumor_dir,cancer)
splice_dat <- read.table(splice_dat_file,header=T,sep="\t")
tumor_geno_file <- sprintf("%s/%s_genotypes.txt",tumor_dir,cancer)
tumor_geno <- read.table(tumor_geno_file,header=T)
summary_file <- sprintf("%s/summaries.txt",tumor_dir)
summaries <- read.table(summary_file)
summaries <- summaries$V1
summaries<-unname(vapply(summaries,function(summ){
str_replace(summ,"kmers_summary","persamp_line")
},character(1)))
for (summ in seq(length(summaries))){
if (summ == 1){
summaries_combined <- read.table(summaries[summ],header=F,sep="\t")
if (length(summaries)>1){
summaries_combined <- summaries_combined[,seq(ncol(summaries_combined)-2)]
}
} else if (summ == length(summaries)){
summaries_fill <- read.table(summaries[summ],header=F,sep="\t")
summaries_combined <- cbind(summaries_combined,summaries_fill)
} else {
summaries_fill <- read.table(summaries[summ],header=F,sep="\t")
summaries_fill <- summaries_fill[,seq(ncol(summaries_fill)-2)]
summaries_combined <- cbind(summaries_combined,summaries_fill)
}
}
sample_types <- sprintf("%s_%s",tumor_geno$sample_id,tumor_geno$type)
sample_types<-c(sample_types,"row","cluster")
colnames(summaries_combined)<-sample_types
tumor_cols <- which(str_detect(sample_types,"_T"))
summaries_combined <- summaries_combined[,c(tumor_cols,length(sample_types)-1,length(sample_types))]
splice_dat_specific <- splice_dat[summaries_combined$row,]
rows_to_keep <- which(!is.na(splice_dat_specific$peptide))
summaries_combined <- summaries_combined[rows_to_keep,]
summaries_combined[is.na(summaries_combined)]<-0
splice_dat_specific <- splice_dat_specific[rows_to_keep,]
rows_to_keep <- !(splice_dat_specific$verdict == "annotated" & splice_dat_specific$modified == "changed")
summaries_combined <- summaries_combined[rows_to_keep,]
splice_dat_specific <- splice_dat_specific[rows_to_keep,]
med_summaries_combined <- apply(summaries_combined[,seq(ncol(summaries_combined)-2)],1,median)
av_summaries_combined <- apply(summaries_combined[,seq(ncol(summaries_combined)-2)],1,mean)
splice_dat_specific$median_binders_tumor <- med_summaries_combined
splice_dat_specific$mean_binders_tumor <- av_summaries_combined
saveRDS(splice_dat_specific,file=sprintf("%s/%s_splice_dat_specific.rds",tumor_dir,cancer))
genes <- unique(splice_dat_specific$gene)
per_gene_data <- data.frame(genes)
median_binders <- vapply(genes,function(g){
splice_dat_small <- splice_dat_specific %>% dplyr::filter(gene == g)
sum(splice_dat_small$median_binders_tumor)
},numeric(1))
per_gene_data$median_binders <- median_binders
ann_prop <- vapply(genes,function(g){
splice_dat_small <- splice_dat_specific %>% dplyr::filter(gene == g)
length(which(splice_dat_small$verdict == "annotated"))/nrow(splice_dat_small)
},numeric(1))
per_gene_data$ann_prop <- ann_prop
saveRDS(per_gene_data,file=sprintf("%s/%s_per_gene_data.rds",tumor_dir,cancer))
}
tumor_data_file <- "/media/theron/My_Passport/TCGA_junctions/TCGA_cancers/filenames.txt"
tumor_data <- read.table(tumor_data_file)
cancers <- basename(tumor_data$V1)
nogos <- c("ESCA","MESO","PAAD","KIRC","GBM")
genes <- c()
for (i in seq(nrow(tumor_data))){
print(sprintf("%d out of %d",i,nrow(tumor_data)))
tumor_dir <- tumor_data[i,]
cancer <- basename(tumor_dir)
print(cancer)
if (cancer %in% nogos){next}
per_gene_data<-readRDS(file=sprintf("%s/%s_per_gene_data.rds",tumor_dir,cancer))
genes <- unique(c(genes,per_gene_data$genes))
}
## [1] "1 out of 19"
## [1] "BLCA"
## [1] "2 out of 19"
## [1] "BRCA"
## [1] "3 out of 19"
## [1] "CHOL"
## [1] "4 out of 19"
## [1] "COAD"
## [1] "5 out of 19"
## [1] "ESCA"
## [1] "6 out of 19"
## [1] "GBM"
## [1] "7 out of 19"
## [1] "HNSC"
## [1] "8 out of 19"
## [1] "KICH"
## [1] "9 out of 19"
## [1] "KIRC"
## [1] "10 out of 19"
## [1] "KIRP"
## [1] "11 out of 19"
## [1] "LIHC"
## [1] "12 out of 19"
## [1] "LUAD"
## [1] "13 out of 19"
## [1] "LUSC"
## [1] "14 out of 19"
## [1] "MESO"
## [1] "15 out of 19"
## [1] "PAAD"
## [1] "16 out of 19"
## [1] "PRAD"
## [1] "17 out of 19"
## [1] "READ"
## [1] "18 out of 19"
## [1] "THCA"
## [1] "19 out of 19"
## [1] "UCEC"
per_gene_data_tot_binders <- data.frame(genes)
per_gene_data_tot_prop <- data.frame(genes)
rownames(per_gene_data_tot_binders) <- per_gene_data_tot_binders$genes
rownames(per_gene_data_tot_prop) <- per_gene_data_tot_prop$genes
cancers <- c()
for (i in seq(nrow(tumor_data))){
print(sprintf("%d out of %d",i,nrow(tumor_data)))
tumor_dir <- tumor_data[i,]
cancer <- basename(tumor_dir)
print(cancer)
if (cancer %in% nogos){next}
cancers <- c(cancers,cancer)
per_gene_data<-readRDS(file=sprintf("%s/%s_per_gene_data.rds",tumor_dir,cancer))
rownames(per_gene_data)<-per_gene_data$genes
per_gene_data_tot_binders[,cancer]<- -1
per_gene_data_tot_prop[,cancer]<- -1
per_gene_data_tot_binders[per_gene_data$genes,cancer] <- per_gene_data$median_binders
per_gene_data_tot_prop[per_gene_data$genes,cancer] <- per_gene_data$ann_prop
}
## [1] "1 out of 19"
## [1] "BLCA"
## [1] "2 out of 19"
## [1] "BRCA"
## [1] "3 out of 19"
## [1] "CHOL"
## [1] "4 out of 19"
## [1] "COAD"
## [1] "5 out of 19"
## [1] "ESCA"
## [1] "6 out of 19"
## [1] "GBM"
## [1] "7 out of 19"
## [1] "HNSC"
## [1] "8 out of 19"
## [1] "KICH"
## [1] "9 out of 19"
## [1] "KIRC"
## [1] "10 out of 19"
## [1] "KIRP"
## [1] "11 out of 19"
## [1] "LIHC"
## [1] "12 out of 19"
## [1] "LUAD"
## [1] "13 out of 19"
## [1] "LUSC"
## [1] "14 out of 19"
## [1] "MESO"
## [1] "15 out of 19"
## [1] "PAAD"
## [1] "16 out of 19"
## [1] "PRAD"
## [1] "17 out of 19"
## [1] "READ"
## [1] "18 out of 19"
## [1] "THCA"
## [1] "19 out of 19"
## [1] "UCEC"
row_ann <- unname(vapply(genes,function(gene){
if (str_detect(gene,"-")){
return("Fusion")
} else {
return("Single")
}
},character(1)))
row_ann <- data.frame(row_ann)
rownames(row_ann)<-genes
per_gene_data_tot_binders <- per_gene_data_tot_binders[,seq(2,ncol(per_gene_data_tot_binders))]
per_gene_data_tot_prop <- per_gene_data_tot_prop[,seq(2,ncol(per_gene_data_tot_prop))]
per_gene_comp <- per_gene_data_tot_binders*per_gene_data_tot_prop
Heatmap(log10(per_gene_comp+1),
right_annotation = rowAnnotation(df=row_ann),
show_row_names=F,
show_column_names = T,
cluster_rows=T,
cluster_columns=T)
## Warning: The input is a data frame, convert it to the matrix.
## `use_raster` is automatically set to TRUE for a matrix with more than
## 2000 rows. You can control `use_raster` argument by explicitly setting
## TRUE/FALSE to it.
##
## Set `ht_opt$message = FALSE` to turn off this message.
## 'magick' package is suggested to install to give better rasterization.
##
## Set `ht_opt$message = FALSE` to turn off this message.